#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 18 17:52:45 2018

@author: jia.liu
"""
import numpy as np
import pandas as pd

def my_drop_high_corr(df: pd.DataFrame, threshold: float = 0.9) -> pd.DataFrame:
    """
    Drop columns that are too strongly correlated with an earlier column.

    The absolute pairwise correlation matrix of ``df`` is computed with
    ``DataFrame.corr()`` (Pearson by default).  Only the strict upper
    triangle is inspected, so each pair of columns is considered once and,
    of two highly correlated columns, the one appearing later in column
    order is the one removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    threshold : float, default 0.9
        A column is dropped when its absolute correlation with any column
        before it is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame without the highly correlated columns.
    """
    corr_matrix = df.corr().abs()

    # Mask out the diagonal and the lower triangle: the diagonal is always 1
    # and the lower triangle just mirrors the upper one.  The builtin ``bool``
    # is used because the ``np.bool`` alias no longer exists in recent NumPy.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Masked-out cells are NaN, and NaN > threshold is False, so only genuine
    # upper-triangle correlations can trigger a drop.
    to_drop = [
        column for column in upper.columns
        if (upper[column] > threshold).any()
    ]

    return df.drop(columns=to_drop)
if __name__ == '__main__':
    # Small smoke run: a 6x4 frame of standard-normal samples, filtered with
    # a lower-than-default threshold. The filtered result is not used.
    demo_frame = pd.DataFrame(np.random.randn(6, 4), columns=list('abcd'))
    my_drop_high_corr(demo_frame, threshold=0.6)
